OSM vs Google Maps

import sys
import numpy as np
import pandas as pd

from datatable import dt, f, by

from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

from plotly import express as px, io as pio
pd.options.plotting.backend = 'plotly'
pio.renderers.default = 'plotly_mimetype+notebook_connected'

from utils import get_path, data_load
sys.path.insert(0, '../')
from secret import API_KEY

Fontes de Dados

Rotas SIH

Colunas

cols_sih = {
  'cod_municipio': 'origem',
  'hosp_cod_municipio': 'destino',
}

Carregando tabela

path_sih = get_path('SIH', 'sih.jay')
df_sih = dt.fread(path_sih, columns=cols_sih)
df_sih.names = cols_sih
df_sih = df_sih[:, list(cols_sih.values())]
df_sih = df_sih[f.origem != f.destino, :]
df_sih['count'] = 1
df_sih = df_sih[:, dt.sum(f.count), by('origem', 'destino')]
df_sih = df_sih.sort(-f.count)
df_sih
origemdestinocount
▪▪▪▪▪▪▪▪▪▪▪▪▪▪▪▪
051084051034022807
131546031062017147
226079026116016012
352125053001015461
428048028003015090
533035033028514875
623037023044014545
732050032053014543
815008015014014521
943230043149014273
1052002553001013577
1135150035528013106
1252014052087012745
1341004041069012717
1452218553001012112
68,5685300105213081
68,5695300105217701
68,5705300105220001
68,5715300105220201
68,5725300105220451

Rotas OSM

Carregando tabela

path_osm = get_path('DISTANCIAS', f'deslocamentos.csv.gzip')
df_osm = pd.read_csv(path_osm)
df_osm
origem destino distancia (km) tempo (min) count
Loading... (need help?)

Rotas Gmaps

Colunas

cols_gmaps = [
  'origem', 'destino', 'count',
  'distance (value)', 'duration (value)',
  'duration_in_traffic (value)',
]

Carregando tabela

path_routes = get_path('GESTANTES', f'gmaps/routes.csv.gzip')
df_files = pd.read_csv(path_routes)
df_files
status origem destino origin_addresses destination_addresses origem_latitude origem_longitude destino_latitude destino_longitude hour departure_time mode traffic_model distance (value) distance (text) duration (value) duration (text) duration_in_traffic (value) duration_in_traffic (text) count
Loading... (need help?)

Selecionando

driving pessimistic at 6pm

status = df_files['status'] == 'OK'
mode = df_files['mode'] == 'driving'
traffic_model = df_files['traffic_model'] == 'pessimistic'
hour = df_files['hour'] == 18
df_gmaps = df_files[status & mode & traffic_model & hour]
df_gmaps = df_gmaps.sort_values(by='count', ascending=False).reset_index(drop=True)
df_gmaps = df_gmaps[cols_gmaps]
df_gmaps
origem destino count distance (value) duration (value) duration_in_traffic (value)
Loading... (need help?)

Gerando tabela

df_sih = df_sih.to_pandas()
df_sih_osm = pd.merge(df_sih, df_osm, on=list(df_sih.columns))
df_sih_gmaps = pd.merge(df_sih, df_gmaps, on=list(df_sih.columns))
df_rotas = pd.merge(df_sih_osm, df_sih_gmaps, on=['origem', 'destino', 'count'])
df_rotas
origem destino count distancia (km) tempo (min) distance (value) duration (value) duration_in_traffic (value)
Loading... (need help?)
df_rotas['distance (value)'] = df_rotas['distance (value)'] / 1000
df_rotas['duration (value)'] = df_rotas['duration (value)'] / 60
df_rotas['duration_in_traffic (value)'] = df_rotas['duration_in_traffic (value)'] / 60
df_rotas
origem destino count distancia (km) tempo (min) distance (value) duration (value) duration_in_traffic (value)
Loading... (need help?)
dist_gmaps = df_rotas['distance (value)']
dist_osm = df_rotas['distancia (km)']
dist_diff = (dist_gmaps - dist_osm) / dist_osm
df_rotas['dist_diff'] = dist_diff
tempo_gmaps = df_rotas['duration (value)']
tempo_traffic_gmaps = df_rotas['duration_in_traffic (value)']
tempo_osm = df_rotas['tempo (min)']
tempo_diff = (tempo_gmaps - tempo_osm) / tempo_osm
df_rotas['tempo_diff'] = tempo_diff
tempo_diff = (tempo_gmaps - tempo_osm) / tempo_osm
tempo_traffic_diff = (tempo_traffic_gmaps - tempo_osm) / tempo_osm
df_rotas['tempo_diff'] = tempo_diff
df_rotas['tempo_traffic_diff'] = tempo_traffic_diff
df_rotas
origem destino count distancia (km) tempo (min) distance (value) duration (value) duration_in_traffic (value) dist_diff tempo_diff tempo_traffic_diff
Loading... (need help?)

Investigação

Completude

df_routes_nan = df_sih_osm.merge(df_sih_gmaps, on=['origem', 'destino', 'count'], how='left')
df_routes_valid = df_routes_nan[~df_routes_nan['distance (value)'].isna()]
df_routes_nan = df_routes_nan[df_routes_nan['distance (value)'].isna()]
df_routes_nan
origem destino count distancia (km) tempo (min) distance (value) duration (value) duration_in_traffic (value)
Loading... (need help?)
df_routes_nan[df_routes_nan['count'] > 1]
origem destino count distancia (km) tempo (min) distance (value) duration (value) duration_in_traffic (value)
Loading... (need help?)
nan = df_routes_nan['count'].sum()
valid = df_routes_valid['count'].sum()

Faltantes

px.bar(x=['nan', 'valid'], y=[nan, valid])
px.histogram(df_routes_nan, x='count')#, range_x=[-.3, .45])

Diferença entre OSM e Gmaps

Distância

Histograma

px.histogram(df_rotas, x='dist_diff', range_x=[-.3, .45])

Scatter

# px.scatter(df_rotas, x='dist_diff', y=, range_x=[-.3, .45])

Tempo (padrão)

Histograma

px.histogram(df_rotas, x='tempo_diff', range_x=[-.75, 1])

Scatter

px.scatter(
  df_rotas,
  x='tempo_diff', y='distancia (km)',
  opacity=.75, size='distance (value)',
  log_x=True, log_y=True,
)

Tempo (tráfego)

Histograma

px.histogram(df_rotas, x='tempo_traffic_diff', range_x=[-.6, 1.5])

Scatter

px.scatter(
  df_rotas,
  x='tempo_traffic_diff', y='distancia (km)',
  opacity=.75, size='distance (value)',
  log_x=True, log_y=True,
)

Exportando tabela

output_route = get_path('GESTANTES', 'routes.csv.gzip')
df_rotas.to_csv(output_route, index=False)
pd.read_csv(output_route)
origem destino count distancia (km) tempo (min) distance (value) duration (value) duration_in_traffic (value) dist_diff tempo_diff tempo_traffic_diff
Loading... (need help?)